#############################################
# Microarray Basics
#############################################

library(affy)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Warning in read.dcf(con): URL 'http://bioconductor.org/BiocInstaller.dcf':
## status was 'Couldn't resolve host name'
library(affydata)
##      Package    LibPath                                Item      
## [1,] "affydata" "/Users/dancikg/Library/R/3.4/library" "Dilution"
##      Title                        
## [1,] "AffyBatch instance Dilution"
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
############################################################
# Look at the Dilution dataset: this contains 4 liver
# tissue hybridized in concentrationsof 10 and 20 micrograms
# to scanner 1(A) and scanner 2(B). Note that Dilution is an
# object of type AffyBatch (or an extension of an 
# ExpressionSet (eSet) object)
######################################################

data(Dilution)
Dilution
## Warning: replacing previous import 'AnnotationDbi::tail' by 'utils::tail'
## when loading 'hgu95av2cdf'
## Warning: replacing previous import 'AnnotationDbi::head' by 'utils::head'
## when loading 'hgu95av2cdf'
## 
## AffyBatch object
## size of arrays=640x640 features (35221 kb)
## cdf=HG_U95Av2 (12625 affyids)
## number of samples=4
## number of genes=12625
## annotation=hgu95av2
## notes=
####################################################
# Methods (functions) for AffyBatch objects
####################################################
sampleNames(Dilution)     # the names of the samples
## [1] "20A" "20B" "10A" "10B"
experimentData(Dilution)  # experiment information
## Experiment data
##   Experimenter name: Gene Logic 
##   Laboratory: Gene Logic 
##   Contact information: 708 Quince Orchard Road
## Gaithersburg, MD 20878
## Telephone: 1.301.987.1700
## Toll Free: 1.800.GENELOGIC (US and Canada)
## Facsimile: 1.301.987.1701
##  
##   Title: Small part of dilution study 
##   URL: http://qolotus02.genelogic.com/datasets.nsf/ 
##   PMIDs:  
## 
##   Abstract: A 68 word abstract is available. Use 'abstract' method.
##   notes:
##    :     
## 
annotation(Dilution)      # annotation (the microarray used)
## [1] "hgu95av2"
phenoData(Dilution)       # get phenotypic (clinical) data
## An object of class 'AnnotatedDataFrame'
##   sampleNames: 20A 20B 10A 10B
##   varLabels: liver sn19 scanner
##   varMetadata: labelDescription
pData(Dilution) # phenotypic data in table form
##     liver sn19 scanner
## 20A    20    0       1
## 20B    20    0       2
## 10A    10    0       1
## 10B    10    0       2
varMetadata(Dilution) # description of phenotypic data
##                                                               labelDescription
## liver                    amount of liver RNA hybridized to array in micrograms
## sn19    amount of central nervous system RNA hybridized to array in micrograms
## scanner                                              ID number of scanner used
####################################################
# Let's look at each microarray
####################################################
image(Dilution, col = heat.colors(500))

####################################################
# Produces a boxplot of log base 2 intensities # 
# Does it seem fair to compare gene expression 
# across concentrations or scanners?
####################################################
boxplot(Dilution, col = 1:4, ylab = "log2 expression")

####################################################
# Normalization of microarray data involves
# 1. Background correction to remove noise
# 2. Normalization of samples
# 3. Estimation of "average" probe intensity values
####################################################

################################################################
# We will use Robust Multi-array average (RMA) which involves 
# 1. Background correction of probe level intensity values
# 2. Quantile Normalization
# 3. Estimation of average probe set values on log2 scale
################################################################

Dilution.rma <- rma(Dilution)        # perform RMA
## Background correcting
## Normalizing
## Calculating Expression
Dilution.expr <- exprs(Dilution.rma) # extract the expression values

################################################################
# confirm that samples are quantile normalized
################################################################
boxplot(Dilution.expr, col = 2:5, ylab = "log2 expression",
        main = "Small part of dilution study (after RMA normalization)",
        xlab = "Sample")

################################################################
# Let's look at a leukemiadataset, and compare 
# acute lymphoblastic leukemia (ALL) samples with healthy, 
# non-leukemia (noL) bone marrow samples
################################################################
library(leukemiasEset)
data(leukemiasEset)

################################################################
# Extract phenotype data. How many samples of each leukemia 
# type is there?
################################################################
leukemia.p <- pData(leukemiasEset)
table(leukemia.p$LeukemiaType)
## 
## ALL AML CLL CML NoL 
##  12  12  12  12  12
################################################################
# This data is already processed using RMA, so we can extract
# the expression data
################################################################
leukemia.expr <- exprs(leukemiasEset) 

# confirm that data has been processed #
boxplot(leukemia.expr, main = "Leukemia samples", ylab = "log2 expression")

######################################################################
# Let's compare expression between ALL and healthy bone marrow 
# samples for for the following probes:

# ENSG00000171960 - corresponds to gene PPIH 
#       (https://www.ncbi.nlm.nih.gov/gene/10465)
# ENSG00000135679 - corresponds to gene MDM2
#       (https://www.ncbi.nlm.nih.gov/gene/4193)

# We want to calculuate the fold change (see below) and the p-value
# evaluating whether or not any difference in expression between
# ALL and healthy samples are statistically significant
# (in other words, are the genes differentially expressed?)
######################################################################

# find expression for desired probe
m <- match("ENSG00000171960", rownames(leukemia.expr))
m
## [1] 12857
df <- data.frame(expr = leukemia.expr[m,], type = leukemia.p$LeukemiaType)

ggplot(df,aes(type, expr, fill = type)) + geom_boxplot() +
  theme_classic() + theme(legend.position = "none") + 
  labs(x = "Leukemia Type", y = "Log2 Expression") +
  ggtitle("PPIH (ENSG00000171960) expression")

# let's only look at ALL and NoL
df <- filter(df, type == "ALL" | type == "NoL")
ggplot(df,aes(type, expr, fill = type)) + geom_boxplot() +
  theme_classic() + theme(legend.position = "none") + 
  labs(x = "Leukemia Type", y = "Log2 Expression") +
  ggtitle("PPIH (ENSG00000171960) expression")

########################################################################
# Calculate fold change (FC) which is average expression in the 
# first group divided by the average expression in the second group,
# Since data is on the log2 scale, we must convert back to normal scale
########################################################################

# returns a list of values with elements of the first vector split by
# elements of the second vector; if the second argument is a factor
# and drop is TRUE, we drop levels that do not occur
s <- split(df$expr, df$type, drop = TRUE)  

l <- lapply(s, mean)
logFC <- l$ALL - l$NoL
FC <- 2**logFC  ## data is on log2 scale

main = paste("PPIH (ENSG00000171960) expression\nFC = ", round(FC,2))
ggplot(df,aes(type, expr, fill = type)) + geom_boxplot() +
  theme_classic() + theme(legend.position = "none") + 
  labs(x = "Leukemia Type", y = "Log2 Expression") +
  ggtitle(main)

########################################################################
# Is the difference in fold change statistically significant??
# H0: mu_ALL - mu_normal = 0
# HA: mu_ALL - mu_normal != 0
# where mu_ALL is the mean expression of ALL samples for the 
# probe of interest and mu_normal is the mean expression of
# normal samples
# How do we carry out a hypothesis test comparing two population means?
# (we will do this in class)
########################################################################

res <- t.test(s$ALL, s$NoL)
res
## 
##  Welch Two Sample t-test
## 
## data:  s$ALL and s$NoL
## t = 0.33335, df = 17.201, p-value = 0.7429
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2543382  0.3498908
## sample estimates:
## mean of x mean of y 
##  8.981066  8.933290
res$p.value
## [1] 0.7429002
# we fail to reject the null. There is not sufficient
# evidence that the gene PPIH is differentially expressed
# between ALL and normal samples

########################################################################
# Repeat analysis for probe ENSG00000135679 (MDM2)
########################################################################

# get row index of probe, then create data frame, including only
# ALL and NoL samples
m <- match("ENSG00000135679", rownames(leukemia.expr))
df <- data.frame(expr = leukemia.expr[m,], type = leukemia.p$LeukemiaType)
df <- filter(df, type == "ALL" | type == "NoL")
ggplot(df,aes(type, expr, fill = type)) + geom_boxplot() +
  theme_classic() + theme(legend.position = "none") + 
  labs(x = "Leukemia Type", y = "Log2 Expression") +
  ggtitle("MDM2 (ENSG00000135679) expression")

# Is there evidence that MDM2 is differentially expressed between 
# ALL and normal samples? (we will complete this in class)

s <- split(df$expr, df$type, drop = TRUE)  

l <- lapply(s, mean)
logFC <- l$ALL - l$NoL
FC <- 2**logFC  ## data is on log2 scale

res <- t.test(s$ALL, s$NoL)
res$p.value
## [1] 0.004616965
# because P < 0.05, we reject H0 and conclude that
# MDM2 is differentially expressed between ALL
# and NoL samples